// 文档 https://github.com/hooke007/MPV_lazy/wiki/4_GLSL

// MIT License

// Copyright (c) 2024 Joao Chrisostomo, Kacper Michajłow

// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:

// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.

// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

//!DESC [ArtCNN_v0_C4F8_DS_CMP] (Conv2D)
//!COMPUTE 24 16 12 16
//!HOOK LUMA
//!BIND LUMA
//!SAVE conv2d
//!WIDTH LUMA.w 2.0 *
//!HEIGHT LUMA.h 1.0 *
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Numeric type selection: request the explicit float16 arithmetic extension
// and, when its macro is defined, alias V4/M4/F to 16-bit types; otherwise
// fall back to the standard 32-bit float types.
// M4 is defined for uniformity with the other passes but is not used by this
// pass's hook().
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif

// Tile geometry shared with hook():
//   ksize   - convolution kernel footprint (3x3).
//   offset  - integer half-kernel, ksize / 2 = (1, 1); shifts the tile origin
//             so the kernel is centred on the output pixel.
//   wg_size - invocations per workgroup, taken from gl_WorkGroupSize
//             (presumably 12x16 per the COMPUTE directive's last two
//             arguments - confirm against the mpv user-shader docs).
//   isize   - size of the input tile: workgroup size plus a ksize - 1 halo.
const ivec2 ksize = ivec2(3, 3);
const ivec2 offset = ksize / 2;
const ivec2 wg_size = ivec2(gl_WorkGroupSize);
const ivec2 isize = wg_size + ksize - 1;
// Workgroup-shared input tile, indexed [plane][y][x]; this pass has a single
// scalar plane.
shared F inp[1][isize.y][isize.x];
// Conv2D pass: applies a 3x3 kernel to the single-channel LUMA input and
// produces eight values per input pixel, written as two horizontally adjacent
// 4-component texels of out_image.
void hook() {
    const uvec2 local_xy = gl_LocalInvocationID.xy;
    // Top-left input coordinate covered by this workgroup (before the halo shift).
    ivec2 base = ivec2(gl_WorkGroupID) * wg_size;
    // Cooperative tile load: each invocation starts at its own local coordinate
    // and strides by the workgroup size, so together the workgroup fills the
    // whole isize.x by isize.y tile (workgroup area plus halo).
    // NOTE(review): at frame borders input_base can lie outside the texture
    // (e.g. -1); no clamping is done here, so the fetched value depends on the
    // API's out-of-bounds texelFetch behaviour - confirm this is intended.
    for (uint y = local_xy.y; y < isize.y; y += wg_size.y) {
        for (uint x = local_xy.x; x < isize.x; x += wg_size.x) {
            // The (1, 1) scale and (0, 0) offset are identities here; the later
            // passes use (2, 1) and a second fetch at (1, 0) in the same places.
            const ivec2 input_base = (base + ivec2(x,y) - offset) * ivec2(1, 1);
            inp[0][y][x] = F(LUMA_mul * texelFetch(LUMA_raw, input_base + ivec2(0, 0), 0).x);
        }
    }

    // Every invocation must finish writing the shared tile before any of them
    // reads its neighbours' entries.
    barrier();
    // Accumulators start from constant vectors (presumably the layer's bias
    // terms - the values come from the trained model).
    V4 result0 = V4(-0.02793066, -0.013429072, -0.10136401, -0.022102064);
    V4 result1 = V4(0.030206393, -0.022696504, -0.06041801, -0.037743982);
    // Nine taps of the 3x3 window. Naming: inp_<plane>_<dx>_<dy> reads
    // inp[plane][local_y + dy][local_x + dx].
    const F inp_0_0_0 = inp[0][local_xy.y + 0][local_xy.x + 0];
    const F inp_0_1_0 = inp[0][local_xy.y + 0][local_xy.x + 1];
    const F inp_0_2_0 = inp[0][local_xy.y + 0][local_xy.x + 2];
    const F inp_0_0_1 = inp[0][local_xy.y + 1][local_xy.x + 0];
    const F inp_0_1_1 = inp[0][local_xy.y + 1][local_xy.x + 1];
    const F inp_0_2_1 = inp[0][local_xy.y + 1][local_xy.x + 2];
    const F inp_0_0_2 = inp[0][local_xy.y + 2][local_xy.x + 0];
    const F inp_0_1_2 = inp[0][local_xy.y + 2][local_xy.x + 1];
    const F inp_0_2_2 = inp[0][local_xy.y + 2][local_xy.x + 2];
    // Each scalar tap scales a constant weight vector; result0 and result1 each
    // collect four of the eight outputs.
    result0 += V4(0.057026256, -0.15283057, -0.06957069, 0.09081102) * inp_0_0_0;
    result0 += V4(0.043235768, -0.07668256, 0.22917198, -0.36189932) * inp_0_1_0;
    result0 += V4(-0.014436923, -0.20591326, -0.16687132, 0.065253444) * inp_0_2_0;
    result0 += V4(0.18254009, 0.28630742, 0.148809, -0.21916626) * inp_0_0_1;
    result0 += V4(0.42678052, 0.0047606346, 0.44425744, 0.79052305) * inp_0_1_1;
    result0 += V4(0.04518846, -0.02994846, 0.1484949, -0.076742) * inp_0_2_1;
    result0 += V4(0.06707943, 0.033338692, -0.13067316, 0.12645914) * inp_0_0_2;
    result0 += V4(0.116021335, 0.22101822, -0.18679994, -0.24275896) * inp_0_1_2;
    result0 += V4(-0.1550584, 0.0035355443, 0.13792686, -0.06643809) * inp_0_2_2;
    result1 += V4(-0.13410914, 0.20915498, -0.003327684, -0.22748086) * inp_0_0_0;
    result1 += V4(0.28180102, -0.07500254, -0.15652733, 0.09112636) * inp_0_1_0;
    result1 += V4(0.07799687, 0.22104093, -0.19359332, -0.085979804) * inp_0_2_0;
    result1 += V4(-0.09671542, 0.026963439, 0.3557882, 0.0088433195) * inp_0_0_1;
    result1 += V4(0.41383308, -0.21111634, 0.16568646, 0.23262085) * inp_0_1_1;
    result1 += V4(-0.38594046, -0.26181793, 0.031138923, -0.13756314) * inp_0_2_1;
    result1 += V4(-0.10163528, -0.01148572, -0.032071803, 0.042949863) * inp_0_0_2;
    result1 += V4(-0.018751707, 0.23158534, 0.05150989, 0.12497162) * inp_0_1_2;
    result1 += V4(-0.0071117315, -0.02669745, -0.1990601, -0.027990788) * inp_0_2_2;
    // The output is twice as wide as the invocation grid: each invocation owns
    // texels (2 * gx, gy) and (2 * gx + 1, gy).
    const ivec2 output_base = ivec2(gl_GlobalInvocationID) * ivec2(2, 1);
    imageStore(out_image, output_base + ivec2(0, 0), result0);
    imageStore(out_image, output_base + ivec2(1, 0), result1);
}

//!DESC [ArtCNN_v0_C4F8_DS_CMP] (Conv2D-1-ReLU)
//!COMPUTE 24 16 12 16
//!HOOK LUMA
//!BIND conv2d
//!SAVE conv2d_1
//!WIDTH LUMA.w 2.0 *
//!HEIGHT LUMA.h 1.0 *
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Numeric type selection: request the explicit float16 arithmetic extension
// and, when its macro is defined, alias V4/M4/F to 16-bit types; otherwise
// fall back to the standard 32-bit float types.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif

// Tile geometry shared with hook():
//   ksize   - convolution kernel footprint (3x3).
//   offset  - integer half-kernel, ksize / 2 = (1, 1).
//   wg_size - invocations per workgroup, taken from gl_WorkGroupSize.
//   isize   - size of the input tile: workgroup size plus a ksize - 1 halo.
const ivec2 ksize = ivec2(3, 3);
const ivec2 offset = ksize / 2;
const ivec2 wg_size = ivec2(gl_WorkGroupSize);
const ivec2 isize = wg_size + ksize - 1;
// Workgroup-shared input tile, indexed [plane][y][x]; two 4-component planes
// hold the eight input values of each pixel.
shared V4 inp[2][isize.y][isize.x];
// Conv2D-1-ReLU pass: applies a 3x3 kernel to the eight values per pixel stored
// in the conv2d texture, clamps the result at zero, and writes eight values per
// pixel as two horizontally adjacent 4-component texels of out_image.
void hook() {
    const uvec2 local_xy = gl_LocalInvocationID.xy;
    // Top-left logical pixel covered by this workgroup (before the halo shift).
    ivec2 base = ivec2(gl_WorkGroupID) * wg_size;
    // Cooperative tile load: each invocation starts at its own local coordinate
    // and strides by the workgroup size, so together the workgroup fills the
    // whole isize.x by isize.y tile (workgroup area plus halo).
    // NOTE(review): at frame borders input_base can lie outside the texture;
    // no clamping is done here, so the fetched value depends on the API's
    // out-of-bounds texelFetch behaviour - confirm this is intended.
    for (uint y = local_xy.y; y < isize.y; y += wg_size.y) {
        for (uint x = local_xy.x; x < isize.x; x += wg_size.x) {
            // Each logical pixel occupies two adjacent texels in x, hence the
            // (2, 1) scale and the fetches at x + 0 and x + 1.
            const ivec2 input_base = (base + ivec2(x,y) - offset) * ivec2(2, 1);
            inp[0][y][x] = V4(conv2d_mul * texelFetch(conv2d_raw, input_base + ivec2(0, 0), 0));
            inp[1][y][x] = V4(conv2d_mul * texelFetch(conv2d_raw, input_base + ivec2(1, 0), 0));
        }
    }

    // Every invocation must finish writing the shared tile before any of them
    // reads its neighbours' entries.
    barrier();
    // Accumulators start from constant vectors (presumably the layer's bias
    // terms - the values come from the trained model).
    V4 result0 = V4(0.10129275, -0.06414359, -0.08569356, 0.18479154);
    V4 result1 = V4(0.1026167, -0.026182352, -0.02379953, -0.084890395);
    // Plane 0 taps of the 3x3 window. Naming: inp_<plane>_<dx>_<dy> reads
    // inp[plane][local_y + dy][local_x + dx].
    const V4 inp_0_0_0 = inp[0][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_0_1_0 = inp[0][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_0_2_0 = inp[0][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_0_0_1 = inp[0][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_0_1_1 = inp[0][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_0_2_1 = inp[0][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_0_0_2 = inp[0][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_0_1_2 = inp[0][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_0_2_2 = inp[0][local_xy.y + 2][local_xy.x + 2];
    // Each tap is multiplied by its own constant 4x4 weight matrix and added to
    // the accumulator; result0 and result1 each collect four of the outputs.
    result0 += M4(-0.061265964, 0.090396844, 0.009725066, 0.05186913, -0.08831569, 0.14115149, 0.02030703, -0.26448712, 0.07124707, 0.28788427, -0.12266943, -0.12626326, 0.030683512, 0.24031335, 0.00847957, 0.021605743) * inp_0_0_0;
    result0 += M4(-0.17793815, 0.05877647, -0.053894203, 0.052915033, 0.13888663, 0.69158316, -0.23580156, 0.10914155, 0.1799638, -0.24527541, 0.1581725, 0.12525447, 0.02236505, -0.7791415, 0.12612069, 0.22587126) * inp_0_1_0;
    result0 += M4(-0.0044638854, 0.04594377, -0.15526178, -0.059616547, -0.113765575, 0.11526328, -0.07493729, -0.19620009, -0.054137748, -0.5558581, -0.095668636, 0.0140260225, 0.12663235, -0.06196508, 0.04590205, -0.16750298) * inp_0_2_0;
    result0 += M4(0.00605247, -0.12793037, -0.2862238, 0.21019322, -0.35067543, -0.16911429, 0.0048360247, -0.33732048, 0.042726777, 0.113944806, -0.10989531, -0.15493381, -0.23354684, -0.07596995, -0.09307506, 0.34996504) * inp_0_0_1;
    result0 += M4(0.33704638, 0.016849698, 0.23076545, 0.0038768104, 0.017964432, 0.12756151, 0.5667818, 0.05665803, 0.4572955, 0.19364744, -0.66275066, 0.06479248, 0.73848844, 0.03941161, -1.3672521, -0.4314275) * inp_0_1_1;
    result0 += M4(-0.045744356, 0.21220443, -0.0152751645, 0.0039241505, -0.1929193, -0.19463874, 0.26790416, -0.2248581, 0.039917603, -0.19291976, -0.009627005, 0.28070587, 0.45722657, 0.33621427, 0.09836808, 0.4606114) * inp_0_2_1;
    result0 += M4(0.3364754, 0.17814378, 0.15460843, -0.37382406, -0.0379763, -0.061667927, -0.1313888, 0.13502872, 0.04525791, -0.49183625, 0.20671068, 0.19864024, -0.09242534, -0.06545957, -0.0792147, 0.31940976) * inp_0_0_2;
    result0 += M4(0.4003053, 0.06277382, 0.389578, -0.10718783, -0.038713276, -1.0066761, 0.7553149, 0.2154803, 0.15907419, -0.09654794, -0.052003246, -0.3251836, -0.40904677, 0.2185564, 1.0964128, 0.32940787) * inp_0_1_2;
    result0 += M4(-0.0605001, -0.1659846, 0.07872434, 0.07011022, 0.02848638, 0.7957063, -0.022979459, 0.22006086, 0.017163865, 0.22233036, 0.031008774, -0.1164339, 0.015682636, 1.1247219, -0.06908766, 0.026357159) * inp_0_2_2;
    result1 += M4(-0.0005456019, -0.21452239, -0.02331185, -0.14731136, -0.04145214, 0.09711069, -0.028228696, 0.09707933, 0.07663301, 0.2605753, 0.06915339, 0.4213312, 0.1791276, -0.60064715, -0.09818523, 0.1204224) * inp_0_0_0;
    result1 += M4(0.08524587, 0.22597405, -0.055976626, 0.24274038, 0.15060245, 1.0501174, -0.108164474, 0.72947687, -0.15228042, -0.34146857, -0.016591283, -0.782046, -0.20293383, -0.9033482, -0.03588182, -0.7317166) * inp_0_1_0;
    result1 += M4(-0.115950175, -0.021860408, -0.012058876, 0.26047283, -0.016127544, 0.113282084, 0.12011924, 0.13397059, 0.012163716, -0.0075326, 0.0077199386, 0.07884925, -0.09810751, 0.07764409, -0.028107159, 0.081475265) * inp_0_2_0;
    result1 += M4(0.18563719, -0.02994726, -0.06645777, -0.0637833, -0.3106664, -0.22407855, -0.15867123, -0.37533087, -0.21445751, 0.20236208, -0.07749975, -0.27633286, 0.22869639, -0.07375291, -0.102342665, -0.19593965) * inp_0_0_1;
    result1 += M4(0.21929435, -0.010503337, -0.20821486, 0.103939354, -0.23477373, -0.48116106, -0.16373561, -0.6000326, 0.58716995, 0.1305978, -0.16039401, 0.090154596, 0.8233463, 0.13591775, -1.0194962, 0.5225424) * inp_0_1_1;
    result1 += M4(0.09095795, 0.031976033, 0.16921851, -0.016144704, -0.029433556, -0.004179158, -0.25524458, 0.18403143, 0.009075674, -0.20736457, 0.44877598, -0.15849751, -0.31659672, 0.8858046, 1.701505, -0.36045247) * inp_0_2_1;
    result1 += M4(0.024224568, -0.33143523, -0.010550282, 0.03448708, -0.014066454, 0.22159861, 0.23519878, 0.32013708, 0.15545098, 0.043488976, 0.08871064, 0.18172316, 0.19699459, 0.37014642, 0.10964019, 0.5172054) * inp_0_0_2;
    result1 += M4(0.27771384, 0.14050864, -0.16724731, 0.075882256, 0.2959148, 0.21900177, 0.009486253, -0.24651274, -0.15580006, -0.24721827, -0.04320085, -0.16685224, 0.16900471, 0.938665, 0.18699513, -0.12453854) * inp_0_1_2;
    result1 += M4(0.19038898, -0.09991527, 0.10804013, 0.13505149, -0.20498928, 0.29891557, 0.092341624, 0.045655612, -0.03703795, 0.28208393, 0.10003851, -0.119713984, -0.06765695, 0.043518074, 0.13848402, -0.25975037) * inp_0_2_2;
    // Plane 1 taps (the second texel of each logical pixel), same window.
    const V4 inp_1_0_0 = inp[1][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_1_1_0 = inp[1][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_1_2_0 = inp[1][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_1_0_1 = inp[1][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_1_1_1 = inp[1][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_1_2_1 = inp[1][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_1_0_2 = inp[1][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_1_1_2 = inp[1][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_1_2_2 = inp[1][local_xy.y + 2][local_xy.x + 2];
    result0 += M4(-0.028919877, 0.06393253, -0.019392107, 0.11690062, 0.17613171, -0.02957729, -0.090882935, 0.28531885, -0.10816185, -0.1609042, 0.12529689, 0.07005133, 0.038638167, 0.4016215, -0.035899144, 0.31989735) * inp_1_0_0;
    result0 += M4(-0.195064, -0.41614702, -0.17279984, -0.054325413, -0.21735904, 0.62869173, 0.12071304, -0.2502229, -0.45668432, -0.31970868, -0.24443577, -0.038767606, -0.06957073, -0.16485514, 0.049657084, 0.02478725) * inp_1_1_0;
    result0 += M4(0.056431413, -0.18719044, -0.10719475, 0.018363247, -0.17722292, 0.2122208, 0.004645252, 0.12296303, 0.06036674, -0.12637898, 0.09256989, 0.17943135, -0.050045628, 0.105112046, 0.13389772, 0.0145468945) * inp_1_2_0;
    result0 += M4(0.2690832, -0.5292614, 0.1811737, 0.57127416, 0.13854288, -0.2258099, 0.19534212, 0.13094272, 0.24820143, -0.13425061, 0.25474843, 0.44848374, -0.19774637, -0.33498517, 0.11781168, -0.07729271) * inp_1_0_1;
    result0 += M4(0.6228834, 0.8945706, -0.7289573, 0.03317639, 0.025072597, -0.671033, 0.9283725, -0.57917243, 0.056543224, -0.216452, -0.36629507, 0.062280286, 0.04854679, 0.41887775, 0.03214725, 0.102295235) * inp_1_1_1;
    result0 += M4(0.09830952, -0.19372275, -0.26301593, -0.02701863, -0.498149, 0.19273509, 0.15852249, 0.023803055, -0.16657764, -0.33725885, -0.14257538, -0.38646558, 0.19214821, 0.078918986, 0.014341017, 0.36686295) * inp_1_2_1;
    result0 += M4(-0.03375639, -0.4789907, -0.20471029, -0.21868499, -0.008542136, -0.32063857, -0.5720973, -0.5680687, -0.11521188, -0.30227494, -0.022915341, 0.46305948, -0.053753376, 0.15432137, -0.2560017, -0.15021442) * inp_1_0_2;
    result0 += M4(0.16950598, -0.085836194, 0.5519811, -0.37358594, -0.08161773, 0.6190293, 0.027968256, 0.3453374, -0.15035647, 0.47191578, -0.8927078, 0.13530083, -0.1203021, -0.40260446, 0.40488142, -0.03589482) * inp_1_1_2;
    result0 += M4(0.18397543, -0.100823574, 0.32704225, -0.13602607, -0.06608742, 0.39020753, -0.25080413, -0.029866215, -0.042082753, -0.26607868, -0.031082256, -0.08710994, 0.18756881, -0.045936808, -0.11693739, 0.4391343) * inp_1_2_2;
    result1 += M4(0.015075238, -0.49435142, -0.051622335, -0.19568186, -0.02574259, -0.10714694, 0.06339236, 0.05981805, -0.19244118, -0.16891009, 0.01241096, -0.39899153, 0.041414715, -0.15652366, 0.0025825426, 0.0140621085) * inp_1_0_0;
    result1 += M4(-0.240981, -0.48601243, -0.07128661, -0.08457859, -0.057051737, 0.69553256, -0.37231475, 0.6955749, -0.072314344, -0.43360832, -0.13301687, -0.024822671, -0.030990753, 0.3336127, -0.024836415, 0.32274234) * inp_1_1_0;
    result1 += M4(-0.086988576, 0.10033653, 0.1301935, -0.1585245, 0.07181238, 0.18984467, 0.36798593, 0.11958539, 0.0014413635, -0.11037422, 0.18609907, -0.14116316, -0.18290721, 0.019914327, 0.14684345, 0.14587171) * inp_1_2_0;
    result1 += M4(-0.088284925, -1.0681832, 0.27054557, -1.6699473, 0.09745132, -0.67685336, -0.04002364, -1.4790534, 0.2844298, -0.49434367, 0.20378776, 0.07094354, -0.027904285, -0.1380507, -0.038403764, -0.53827345) * inp_1_0_1;
    result1 += M4(0.64849734, 0.030422026, -1.3374046, 0.58992964, -0.2635088, 0.19480492, -0.25542572, 0.018307142, -0.19051848, -0.5069711, -0.45955375, -0.18230407, 0.13111992, 0.020810999, -0.10583255, -0.0219436) * inp_1_1_1;
    result1 += M4(-0.057441, 0.16074783, 0.16930343, -0.31773144, 0.287929, 0.016437376, -0.3635953, 0.25382492, 0.23063485, 0.0045389654, -0.7762413, -0.023484955, 0.17786536, 0.14914857, 0.15387803, -0.04235477) * inp_1_2_1;
    result1 += M4(0.07807947, 0.3587233, -0.008453021, 0.46564314, 0.05921373, 0.59635323, -0.12062484, 0.47595054, -0.18499763, -0.19337143, 0.012178744, -0.4070805, -0.31207308, -0.16092423, -0.090218134, 0.08824988) * inp_1_0_2;
    result1 += M4(0.09272045, 0.2512585, -0.058250524, 0.28842798, -0.13555422, -0.33833557, 0.4602806, 0.0152045535, -0.1016229, 0.56653386, -0.21230498, 0.2605939, 0.07677343, 0.042531498, -0.09841096, -0.2941997) * inp_1_1_2;
    result1 += M4(-0.14381622, 0.14326291, 0.40332323, -0.10102165, 0.15972783, 0.20199472, -0.26654097, 0.06869106, 0.015716303, -0.6661083, -0.12832412, 0.009280587, -0.04723174, -0.047029227, 0.12782177, 0.101092346) * inp_1_2_2;
    // Each invocation owns output texels (2 * gx, gy) and (2 * gx + 1, gy);
    // max(..., 0) is the ReLU named in the pass description.
    const ivec2 output_base = ivec2(gl_GlobalInvocationID) * ivec2(2, 1);
    imageStore(out_image, output_base + ivec2(0, 0), max(result0, V4(0.0)));
    imageStore(out_image, output_base + ivec2(1, 0), max(result1, V4(0.0)));
}

//!DESC [ArtCNN_v0_C4F8_DS_CMP] (Conv2D-2-ReLU)
//!COMPUTE 24 16 12 16
//!HOOK LUMA
//!BIND conv2d_1
//!SAVE conv2d_2
//!WIDTH LUMA.w 2.0 *
//!HEIGHT LUMA.h 1.0 *
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Numeric type selection: request the explicit float16 arithmetic extension
// and, when its macro is defined, alias V4/M4/F to 16-bit types; otherwise
// fall back to the standard 32-bit float types.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif

// Tile geometry shared with hook():
//   ksize   - convolution kernel footprint (3x3).
//   offset  - integer half-kernel, ksize / 2 = (1, 1).
//   wg_size - invocations per workgroup, taken from gl_WorkGroupSize.
//   isize   - size of the input tile: workgroup size plus a ksize - 1 halo.
const ivec2 ksize = ivec2(3, 3);
const ivec2 offset = ksize / 2;
const ivec2 wg_size = ivec2(gl_WorkGroupSize);
const ivec2 isize = wg_size + ksize - 1;
// Workgroup-shared input tile, indexed [plane][y][x]; two 4-component planes
// hold the eight input values of each pixel.
shared V4 inp[2][isize.y][isize.x];
// Conv2D-2-ReLU pass: applies a 3x3 kernel to the eight values per pixel stored
// in the conv2d_1 texture, clamps the result at zero, and writes eight values
// per pixel as two horizontally adjacent 4-component texels of out_image.
void hook() {
    const uvec2 local_xy = gl_LocalInvocationID.xy;
    // Top-left logical pixel covered by this workgroup (before the halo shift).
    ivec2 base = ivec2(gl_WorkGroupID) * wg_size;
    // Cooperative tile load: each invocation starts at its own local coordinate
    // and strides by the workgroup size, so together the workgroup fills the
    // whole isize.x by isize.y tile (workgroup area plus halo).
    // NOTE(review): at frame borders input_base can lie outside the texture;
    // no clamping is done here, so the fetched value depends on the API's
    // out-of-bounds texelFetch behaviour - confirm this is intended.
    for (uint y = local_xy.y; y < isize.y; y += wg_size.y) {
        for (uint x = local_xy.x; x < isize.x; x += wg_size.x) {
            // Each logical pixel occupies two adjacent texels in x, hence the
            // (2, 1) scale and the fetches at x + 0 and x + 1.
            const ivec2 input_base = (base + ivec2(x,y) - offset) * ivec2(2, 1);
            inp[0][y][x] = V4(conv2d_1_mul * texelFetch(conv2d_1_raw, input_base + ivec2(0, 0), 0));
            inp[1][y][x] = V4(conv2d_1_mul * texelFetch(conv2d_1_raw, input_base + ivec2(1, 0), 0));
        }
    }

    // Every invocation must finish writing the shared tile before any of them
    // reads its neighbours' entries.
    barrier();
    // Accumulators start from constant vectors (presumably the layer's bias
    // terms - the values come from the trained model).
    V4 result0 = V4(-0.0034465874, -0.079036474, 0.025276389, -0.0073453523);
    V4 result1 = V4(0.013591624, -0.15580432, -0.0015650324, -0.14402562);
    // Plane 0 taps of the 3x3 window. Naming: inp_<plane>_<dx>_<dy> reads
    // inp[plane][local_y + dy][local_x + dx].
    const V4 inp_0_0_0 = inp[0][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_0_1_0 = inp[0][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_0_2_0 = inp[0][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_0_0_1 = inp[0][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_0_1_1 = inp[0][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_0_2_1 = inp[0][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_0_0_2 = inp[0][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_0_1_2 = inp[0][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_0_2_2 = inp[0][local_xy.y + 2][local_xy.x + 2];
    // Each tap is multiplied by its own constant 4x4 weight matrix and added to
    // the accumulator; result0 and result1 each collect four of the outputs.
    result0 += M4(0.22912103, 0.09105632, -0.025404993, -0.005314299, -1.3153944, -0.035125285, 0.0024964593, -0.06951302, -1.3843726, -0.030622572, -0.003954646, 0.04768716, -0.99095553, 0.07380039, 0.010352862, -0.062900595) * inp_0_0_0;
    result0 += M4(0.22423756, 0.12364775, 0.01946751, 0.3283192, -0.056606673, 0.06666786, 0.030980079, 0.32059258, -0.060214326, -0.93854856, 0.02242121, -0.88109, -0.24108063, -0.021870721, -0.07140059, 0.11170756) * inp_0_1_0;
    result0 += M4(-0.062845156, 0.035209425, -0.06303984, -0.10082241, 0.066033736, 0.108851925, 0.0043006474, -0.11628237, -0.011160544, -0.028099207, -0.026879951, -0.36184248, -0.39494818, -0.060878422, -0.0033392266, -0.23139606) * inp_0_2_0;
    result0 += M4(0.5443439, 0.30595094, -0.042832755, -0.15526743, -0.07133167, 0.13151437, -0.09530897, 0.10046027, 0.0155177405, 0.076852895, 0.027968956, 0.072254226, -0.08629247, 0.14945002, 0.06699406, 0.116390675) * inp_0_0_1;
    result0 += M4(0.24621738, -0.01371094, 0.88144696, 0.56907856, 0.04231563, 0.08061371, 0.028530035, -0.22426939, -0.09081422, -1.4205712, -0.025353098, 0.00853732, 0.20624034, 0.47733393, -0.030515354, 0.19840188) * inp_0_1_1;
    result0 += M4(0.13206916, 0.060304537, -0.18602288, 0.05516314, -0.043583304, 0.06654624, -0.00059487426, -0.06514846, 0.028158398, 0.003506506, -0.000682995, 0.037776552, 0.07556124, 0.055103365, -0.024982406, -0.1488586) * inp_0_2_1;
    result0 += M4(-0.30931398, 0.287899, 0.049758483, -0.010997782, 0.08125986, 0.14587112, -0.06391396, 0.08690046, 0.052382447, 0.04065286, 0.008235151, 0.019142874, -0.0040573115, 0.13682128, 0.009873448, 0.042159908) * inp_0_0_2;
    result0 += M4(0.2857843, 0.5293818, 0.037131947, 0.26617676, -0.060609605, 0.042696986, 0.023416908, 0.07863082, -0.061399084, -0.002501784, 0.0064091496, 0.051093455, 0.06586543, -0.011243571, -0.010859356, -0.034067668) * inp_0_1_2;
    result0 += M4(-0.096091315, 0.026602767, 0.05015067, -0.015912315, -0.030859442, 0.065069824, 0.0032723104, 0.065567896, 0.05270799, 0.017436722, -0.014831356, 0.042770274, 0.056945927, -0.028664406, 0.010296687, 0.039703216) * inp_0_2_2;
    result1 += M4(-0.3647073, -0.1877831, -0.18343303, -0.12622787, -0.28043473, -0.10243967, -0.1492094, 0.052043095, -0.13419518, 0.10606568, -0.09947906, -0.1801593, 0.13169923, 0.07323866, -0.018481843, 0.14573799) * inp_0_0_0;
    result1 += M4(-0.40681902, -0.040143244, -0.0130191995, 0.0245515, -0.3531637, -0.08130964, -0.12361341, 0.054996345, 0.19632003, 0.076899834, 0.12194427, -0.6763197, 0.099606946, 0.11638685, 0.030441059, 0.10328442) * inp_0_1_0;
    result1 += M4(-0.05095857, 0.20681822, -0.022873113, 0.050000485, -0.10196304, 0.027042165, -0.06544703, 0.11296282, 0.11043819, -0.6474361, -0.15849607, -0.06044965, -0.029342428, 0.30745754, 0.07467549, 0.16123064) * inp_0_2_0;
    result1 += M4(0.41906917, -0.011245052, -0.26884058, 0.20049511, -0.3738161, -0.10313803, -0.27504876, 0.28688076, -0.16100776, 0.027700689, -0.19163373, -0.23014504, -0.14638221, 0.05579441, 0.10570992, 0.08801736) * inp_0_0_1;
    result1 += M4(0.30147532, 0.11350887, 0.30739275, -0.63574046, -0.72044915, 0.09812565, -0.075364456, 0.07721041, 0.19979739, -0.44766888, 0.18335406, -0.51229286, -0.36800867, -0.02594683, -0.09355402, 0.923733) * inp_0_1_1;
    result1 += M4(-0.09772773, -0.07243453, 0.01873751, -0.13747643, -0.3475008, 0.025457017, -0.028309438, -0.024790363, -0.08691439, -0.39055392, -0.13366286, -0.060083576, 0.14139567, 0.9152149, -0.07499597, 0.2355989) * inp_0_2_1;
    result1 += M4(-0.15817338, -0.12194576, -0.5126775, 0.020087004, -0.30001676, 0.0511159, 0.018671507, 0.04873863, -0.037620906, 0.016364167, -0.03230109, 0.058981325, -0.027865624, 0.13155684, -0.060460016, 0.030369835) * inp_0_0_2;
    result1 += M4(-0.5373837, -0.18058442, -0.15060969, 0.71042055, -0.31937096, -0.059212293, 0.049846943, 0.026485777, -0.0043230103, -0.058103852, 0.00048320185, 0.012475382, 0.07848543, -0.17651975, 0.07930786, -0.02322977) * inp_0_1_2;
    result1 += M4(0.2200906, 0.41060942, 0.10256321, -0.16442251, -0.15654956, 0.09105371, 0.04095651, 0.035051964, -0.08579494, -0.052027196, 0.023165444, -0.0065150005, 0.07613921, 0.161168, 0.09237374, 0.039495982) * inp_0_2_2;
    // Plane 1 taps (the second texel of each logical pixel), same window.
    const V4 inp_1_0_0 = inp[1][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_1_1_0 = inp[1][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_1_2_0 = inp[1][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_1_0_1 = inp[1][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_1_1_1 = inp[1][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_1_2_1 = inp[1][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_1_0_2 = inp[1][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_1_1_2 = inp[1][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_1_2_2 = inp[1][local_xy.y + 2][local_xy.x + 2];
    result0 += M4(-2.6741908, 0.090733044, 0.018075056, 0.009288931, -2.3032477, 0.012210743, 0.027604694, 0.0076998253, -2.9115663, 0.05500784, 0.041242838, 0.0007728169, -0.009156706, 0.036516193, -0.018478658, 0.07925723) * inp_1_0_0;
    result0 += M4(-0.01963526, -0.33473113, -0.07147435, -0.7078566, 0.016246386, -0.10183415, -0.061728954, 0.22880243, 0.025434671, 0.03904173, 0.037744578, -0.35376918, 0.07743849, 0.11648018, -0.035661727, -0.28257105) * inp_1_1_0;
    result0 += M4(-0.47149763, -0.10343948, -0.0951253, -0.21205373, -0.018966667, 0.016966231, -0.017401772, -0.03181646, 0.06452831, 0.033105943, 0.01151953, 0.033790663, 0.13000047, 0.021499427, -0.05221461, -0.24746536) * inp_1_2_0;
    result0 += M4(-0.22085422, -0.61037904, 0.05049636, 0.04544393, 0.25563976, 0.048538987, -0.0048596053, -0.10088073, -0.70384496, -1.2879804, 0.015923912, -0.30103382, -0.070138685, -0.062528536, -0.019949518, 0.17826852) * inp_1_0_1;
    result0 += M4(0.4541996, -0.47192723, 0.41748068, -0.11121847, 0.123461396, 0.50839, -0.022422738, 0.11082274, -0.054999802, -1.2645166, -0.044638436, -0.6382176, 0.31587914, 1.7256684, 0.08508813, 0.6796993) * inp_1_1_1;
    result0 += M4(-0.08517383, 0.32122502, 0.09716052, 0.255638, -0.119240046, 0.054865807, -0.0057714144, 0.3559751, -0.028650917, 0.020195812, 0.025027255, -0.042046513, -0.12853204, -0.054128084, 0.019088654, -0.10106185) * inp_1_2_1;
    result0 += M4(0.07195087, -0.30003893, -0.10408939, 0.034646768, 0.08448458, 0.015891004, 0.032937046, -0.059976913, 0.011584458, -0.13791701, 0.041595113, 0.036061373, -0.19855681, -0.14309041, 0.0093324, 0.049287856) * inp_1_0_2;
    result0 += M4(0.18261294, -0.03259465, -0.020631952, -0.40801224, 0.058284476, 0.26670218, 0.013843976, -0.09572007, 0.046960983, -0.01910698, 0.0809777, 0.017899925, -0.14989883, -0.023020748, -0.073753245, 0.27992782) * inp_1_1_2;
    result0 += M4(-0.015727114, -0.09304231, 0.08964175, 0.06741707, 0.018954782, -0.097811535, -0.016614001, 0.010884762, -0.00051851716, 0.023995098, 0.0027733713, 0.037848745, 0.030632988, -0.12688828, -0.07055521, 0.045997698) * inp_1_2_2;
    result1 += M4(0.19463658, 0.23522787, -0.036773816, 0.17221327, -0.006007494, 0.001070713, -0.12512675, 0.23444583, -0.24611008, 0.11847361, -0.044446073, -0.022582166, 0.008311397, 0.08350622, 0.21392798, -0.15430632) * inp_1_0_0;
    result1 += M4(0.73981994, 0.064847544, 0.42952776, -0.10233466, 0.24658886, 0.07354849, 0.0084008835, 0.05433974, -0.089352496, 0.086922735, 0.0070303567, 0.13484779, 0.16367406, -0.12436132, 0.1982714, -0.04903244) * inp_1_1_0;
    result1 += M4(0.042958375, -0.45629272, -0.17741151, 0.08527364, 0.09061999, 0.012951603, 0.052891973, 0.00452325, 0.01507567, -0.12510586, -0.0069022463, 0.011380625, 0.0981156, 0.14886613, 0.018655721, -0.082381785) * inp_1_2_0;
    result1 += M4(-0.026926579, -0.2271819, 0.6266378, -0.63868165, -0.020310247, -0.058518015, -0.054738104, 0.20529555, 0.08260606, 0.24069415, 0.35473198, -0.6288541, 0.33195314, 0.071527965, 0.39613578, -0.30524215) * inp_1_0_1;
    result1 += M4(0.07508051, 0.17296657, -0.44525146, -0.12533702, 0.15290725, 0.29777512, -0.06609803, 0.31893352, 0.065189734, -1.3782824, 0.018296422, -0.12671009, -0.0259192, -0.19671743, -0.15622373, 0.54027694) * inp_1_1_1;
    result1 += M4(-0.3875062, -0.40308774, -0.1008696, 0.21796286, 0.2100782, 0.55421674, 0.12313819, 0.045119386, -0.046736587, -0.27375945, -0.03749772, 0.05291287, 0.16148625, 0.9635284, 0.24951014, 0.050540145) * inp_1_2_1;
    result1 += M4(-0.0029134427, 0.1359645, 0.40379474, 0.35335627, 0.12757972, -0.06454243, -0.065683365, 0.039409272, 0.12427507, 0.16813534, 0.023521964, 0.10082277, 0.24827579, 0.07954877, 0.23682417, -0.17225721) * inp_1_0_2;
    result1 += M4(-0.22437626, 0.25074735, -0.02502282, 0.30681825, 0.32115448, 0.012076249, -0.09231427, 0.057385966, -0.18486787, 0.13957636, 0.06090384, 0.11152896, 0.16529459, -0.2417966, 0.14630775, -0.18540476) * inp_1_1_2;
    result1 += M4(0.32615438, 0.19124036, 0.08615849, -0.15447555, 0.43169418, -0.1375408, 0.020761244, -0.03039656, -0.035095118, -0.048039336, -0.0040607215, 0.029283522, 0.1794816, -0.4203999, 0.13541597, 0.05761537) * inp_1_2_2;
    // Each invocation owns output texels (2 * gx, gy) and (2 * gx + 1, gy);
    // max(..., 0) is the ReLU named in the pass description.
    const ivec2 output_base = ivec2(gl_GlobalInvocationID) * ivec2(2, 1);
    imageStore(out_image, output_base + ivec2(0, 0), max(result0, V4(0.0)));
    imageStore(out_image, output_base + ivec2(1, 0), max(result1, V4(0.0)));
}

//!DESC [ArtCNN_v0_C4F8_DS_CMP] (Conv2D-3-ReLU)
//!COMPUTE 24 16 12 16
//!HOOK LUMA
//!BIND conv2d_2
//!SAVE conv2d_3
//!WIDTH LUMA.w 2.0 *
//!HEIGHT LUMA.h 1.0 *
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Numeric type selection: request the explicit float16 arithmetic extension
// and, when its macro is defined, alias V4/M4/F to 16-bit types; otherwise
// fall back to the standard 32-bit float types.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif

// Tile geometry shared with hook():
//   ksize   - convolution kernel footprint (3x3).
//   offset  - integer half-kernel, ksize / 2 = (1, 1).
//   wg_size - invocations per workgroup, taken from gl_WorkGroupSize.
//   isize   - size of the input tile: workgroup size plus a ksize - 1 halo.
const ivec2 ksize = ivec2(3, 3);
const ivec2 offset = ksize / 2;
const ivec2 wg_size = ivec2(gl_WorkGroupSize);
const ivec2 isize = wg_size + ksize - 1;
// Workgroup-shared input tile, indexed [plane][y][x]; two 4-component planes
// hold the eight input values of each pixel.
shared V4 inp[2][isize.y][isize.x];
// Entry point of the Conv2D-3-ReLU pass.
//   1. The work group cooperatively fills the shared tile `inp`: for each tile
//      cell, two horizontally adjacent texels of conv2d_2 (scaled by
//      conv2d_2_mul) are stored in planes inp[0] and inp[1].
//   2. After the barrier each invocation reads the 3x3 neighbourhood of its own
//      cell from both planes and accumulates one 4x4 matrix product per tap
//      into result0 and result1.
//   3. Both results are clamped below at zero and written to out_image at
//      x * 2 and x * 2 + 1.
// conv2d_2_raw, conv2d_2_mul and out_image are not declared in this part of the
// file; presumably the host injects them for the BIND / SAVE targets - confirm
// against the mpv user-shader documentation.
void hook() {
    const uvec2 local_xy = gl_LocalInvocationID.xy;
    // First cell of this work group, in units of one invocation.
    ivec2 base = ivec2(gl_WorkGroupID) * wg_size;
    // Strided fill: each invocation starts at its own local coordinate and
    // advances by the work-group size until the isize.x by isize.y tile is covered.
    for (uint y = local_xy.y; y < isize.y; y += wg_size.y) {
        for (uint x = local_xy.x; x < isize.x; x += wg_size.x) {
            // Subtracting `offset` makes tile cell (1, 1) correspond to `base`.
            // x is doubled because every cell maps to two adjacent source texels.
            // NOTE(review): on border work groups these coordinates can fall
            // outside the texture; what texelFetch yields there is not
            // established by this code - confirm the host/driver behaviour.
            const ivec2 input_base = (base + ivec2(x,y) - offset) * ivec2(2, 1);
            inp[0][y][x] = V4(conv2d_2_mul * texelFetch(conv2d_2_raw, input_base + ivec2(0, 0), 0));
            inp[1][y][x] = V4(conv2d_2_mul * texelFetch(conv2d_2_raw, input_base + ivec2(1, 0), 0));
        }
    }

    // Every invocation of the group must finish its part of the tile load
    // before any neighbourhood is read.
    barrier();
    // Accumulators start from constant vectors (presumably the layer biases -
    // TODO confirm against the model export).
    V4 result0 = V4(-0.0033592475, -0.0014388481, 0.07497558, -0.030383488);
    V4 result1 = V4(0.0036725544, 0.005904786, -0.0049021845, -0.029746184);
    // Tap naming: inp_<plane>_<dx>_<dy>, read at tile cell (local x + dx, local y + dy).
    const V4 inp_0_0_0 = inp[0][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_0_1_0 = inp[0][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_0_2_0 = inp[0][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_0_0_1 = inp[0][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_0_1_1 = inp[0][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_0_2_1 = inp[0][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_0_0_2 = inp[0][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_0_1_2 = inp[0][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_0_2_2 = inp[0][local_xy.y + 2][local_xy.x + 2];
    // Plane 0 contributions: one constant 4x4 matrix per tap and per result.
    // The summation order is left untouched because reordering floating-point
    // additions can change the rounded outcome.
    result0 += M4(-0.047382455, 0.025344469, 0.0181171, 0.036657475, 0.19789238, -0.084112056, 0.20910491, 0.005659895, 0.15431362, 0.016353851, 0.079063535, -0.0005728114, 0.1770398, -0.07231596, 0.042208716, 0.003223509) * inp_0_0_0;
    result0 += M4(-0.005198564, 0.07243749, -0.12895095, 0.03776277, -0.37100038, -0.20020448, 0.0130324755, -0.16371098, -0.46365273, -0.05421045, -0.07652052, -0.1359629, -0.19953682, -0.095216654, -0.10193687, -0.034134243) * inp_0_1_0;
    result0 += M4(0.110571355, 0.042309392, 0.112033084, -0.030182498, -0.11258857, -0.13477877, 0.15809436, -0.1060642, 0.07806519, -0.022788458, -0.010089714, 0.022634378, -0.021467546, 0.0031064816, 0.040337462, 0.02108241) * inp_0_2_0;
    result0 += M4(0.18573526, 0.02107372, 0.045093484, 3.6858e-05, -0.50279397, 0.10499706, 0.16375375, 0.10734359, -0.4146881, 0.009436764, -0.081162125, 0.034151454, 0.032900818, -0.03328517, 0.06350131, -0.004709188) * inp_0_0_1;
    result0 += M4(-0.21013924, -0.13829933, 0.28593224, -0.14997925, -0.8192081, 0.49219483, -1.1844217, 0.18712105, 0.6159649, 0.6330274, 0.19586581, 0.6548463, -0.35479024, 0.0058147167, -0.48347506, 0.011505747) * inp_0_1_1;
    result0 += M4(-0.016707892, 0.11533008, -0.02007723, -0.41508994, 0.20079564, -0.12343321, -0.45939383, 0.07339779, 0.0023067514, -0.13386646, -0.022290798, -0.008269604, -0.042164344, -0.19538392, -0.03751506, 0.00080294436) * inp_0_2_1;
    result0 += M4(-0.08782037, -0.032432616, -0.033071306, -0.028009225, -0.06257248, 0.0028636009, 0.032183226, -0.008844664, 0.012222819, -0.26866886, -0.24305214, 0.04857716, -0.016061323, -0.14422923, -0.03495442, -0.026271263) * inp_0_0_2;
    result0 += M4(-0.064186856, -0.34323105, -0.40123945, -0.0016100989, 0.055074163, 0.45858273, 0.051816877, 0.1474477, 0.087160185, -0.39144516, 0.2935346, 0.017279387, -0.09503551, 0.11648162, -0.21098037, 0.12650476) * inp_0_1_2;
    result0 += M4(-1.3971008, -2.384039, -0.2498563, 0.3674127, 0.04558076, 0.35731494, -0.05408538, 0.08428405, -0.0836945, 0.21034922, -0.028095214, 0.02335078, 0.041505173, -0.053499777, 0.066727325, -0.020545697) * inp_0_2_2;
    result1 += M4(-0.009771909, -0.008246766, 0.045180075, 0.07376309, -0.14354943, -0.035382163, -0.0828788, -0.0136913555, 0.036681015, -0.0021292868, -0.09257129, 0.010466442, 0.04329196, -0.056212313, 0.10413494, -0.019296508) * inp_0_0_0;
    result1 += M4(-0.0009970729, 0.039378043, 0.052182745, -0.039891273, 0.088978276, 0.24688667, 0.045282334, 0.28672376, -0.028830308, 0.15652381, -0.069095634, -0.044008356, 0.018903816, -0.025395295, 0.16179283, 0.027743176) * inp_0_1_0;
    result1 += M4(-0.031285092, 0.14096266, -0.26551536, 0.02822387, 0.0326168, -0.048688993, -0.11440276, 0.0035761318, 0.025082238, -0.03747916, -0.016897269, 0.0016025803, 0.021633232, -0.23817872, 0.14353456, -0.13916425) * inp_0_2_0;
    result1 += M4(-0.09142695, 0.018557668, -0.15553501, 0.041643713, 0.0172274, 0.05636893, 0.22200873, -0.026947595, -0.043887008, -0.07083508, 0.046704277, -0.2094153, 0.27494004, 0.11447262, 0.24280621, 0.021048661) * inp_0_0_1;
    result1 += M4(-0.14625269, -0.17340726, -0.21486181, -0.18465242, -0.09317577, 0.6173361, 0.13011597, 0.33609286, 0.5835587, -0.05495468, 0.6358209, 0.5731625, 0.16007307, 0.53648245, 0.30973378, 0.36147097) * inp_0_1_1;
    result1 += M4(0.037308723, 0.8912002, -0.34182176, 0.92621887, 0.09126665, 0.34491128, 0.16031723, 0.1840575, -0.24529389, 0.102854624, 0.057091415, 0.08290692, -0.1455217, 0.031668186, 0.23852387, 0.010908772) * inp_0_2_1;
    result1 += M4(0.15588187, -0.04938782, 0.11696999, -0.04598636, -0.40455747, -0.017237816, -0.35576594, -0.052758254, -0.5589744, 0.07721081, 0.11780853, 0.033673868, 0.33094007, -0.035838027, -0.15215519, -0.0021062398) * inp_0_0_2;
    result1 += M4(-0.6615521, 0.0516843, 0.29479, -0.051623844, -1.3020104, -0.060925566, -0.3276319, -0.08529174, -0.017384076, -0.21312267, -0.65302426, 0.061371323, -1.0188646, 0.0130816875, -0.2897414, 0.04984787) * inp_0_1_2;
    result1 += M4(-1.279481, -0.92789495, -0.9963089, -0.26990554, 0.11681462, -0.10300394, -0.41189778, -0.10338683, 0.25029403, 0.017431812, -0.043342497, -0.09488991, 0.20365374, -0.05146486, -0.18754682, -0.016855955) * inp_0_2_2;
    // Same 3x3 neighbourhood, read from plane 1.
    const V4 inp_1_0_0 = inp[1][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_1_1_0 = inp[1][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_1_2_0 = inp[1][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_1_0_1 = inp[1][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_1_1_1 = inp[1][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_1_2_1 = inp[1][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_1_0_2 = inp[1][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_1_1_2 = inp[1][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_1_2_2 = inp[1][local_xy.y + 2][local_xy.x + 2];
    // Plane 1 contributions to both accumulators.
    result0 += M4(-0.02940392, 0.06369084, -0.022564769, 0.032406695, -0.101567365, 0.024062146, -0.028558644, 0.017370543, 0.041567933, -0.004310982, 0.0050648656, 0.0011296376, 0.10800749, 0.09994143, 0.13193645, 0.016811121) * inp_1_0_0;
    result0 += M4(0.26680955, 0.2710635, -0.03750561, 0.17480572, 0.06775689, 0.14861335, 0.06486993, 0.08584792, 0.032451753, -0.06702701, 0.042381134, -0.051514905, 0.175047, -0.014754477, -0.09557878, -0.0831153) * inp_1_1_0;
    result0 += M4(0.2643021, -0.04307404, 0.052555572, 0.030648623, 0.11126605, -0.029577984, 0.117089204, 0.020451976, -0.060505223, -0.046240855, -0.06922756, -0.022307338, -0.013384765, 0.071477525, -0.038503025, 0.010551289) * inp_1_2_0;
    result0 += M4(0.18686187, 0.17204565, -0.2415474, 0.1002066, 0.12625182, 0.08970062, -0.022583384, 0.039813917, -0.113755055, -0.009049383, 0.07099008, -0.017397083, 0.016404182, 0.040644065, 0.136118, -0.028748268) * inp_1_0_1;
    result0 += M4(0.23942399, -0.028241066, 0.24933232, 0.06819488, 0.11138149, 0.37768012, 0.26660925, 0.02452422, -0.008121426, -0.45900872, 0.21924867, -0.15090448, -0.38364422, -0.12504894, -0.09441487, 0.016214145) * inp_1_1_1;
    result0 += M4(0.1623815, 0.089433454, -0.11820267, 0.014907133, 0.060249213, 0.10864594, 0.044479128, -0.026696099, -0.10019558, 0.05204227, 0.09249716, 0.08754035, -0.051470626, -0.24278083, 0.075328715, -0.055121176) * inp_1_2_1;
    result0 += M4(0.12100287, 0.1263055, 0.1614513, -0.0037556137, 0.029281788, -0.009055272, 0.016714582, -0.0067505585, 0.046804734, -0.05794363, -0.10932849, 0.016610224, 0.006643504, 0.18346037, 0.05882119, -0.037053052) * inp_1_0_2;
    result0 += M4(0.05564799, -0.094097175, 0.04650456, 0.042696677, 0.030604092, -0.051149283, 0.011907597, 0.038031437, -0.054455575, 0.37694764, 0.20946856, -0.02663509, -0.15572976, -0.1788365, -0.35169667, -0.0010356593) * inp_1_1_2;
    result0 += M4(0.00088367774, 0.17453815, 0.035771314, 0.010959557, -0.07931933, 0.19718485, 0.10140203, -0.04118182, 0.04234803, -0.085674345, 0.100996785, 0.0054446487, 0.10789721, 0.2384204, -0.08798578, 0.008700553) * inp_1_2_2;
    result1 += M4(-0.12653263, 0.004405884, 0.052246116, -0.021044966, 0.0032004945, 0.022122849, 0.033640653, 0.009701084, 0.088971496, -0.010906008, -0.037774965, 0.014396962, 0.062230583, 0.034671582, 0.045309253, 0.043610834) * inp_1_0_0;
    result1 += M4(0.18973444, 0.05120935, 0.22673985, -0.04913223, 0.13955683, -0.0068015363, 0.100301735, 0.042273633, 0.017822132, 0.03835167, 0.051225726, 0.07273987, 0.12566234, 0.18629858, 0.37267902, -0.13469224) * inp_1_1_0;
    result1 += M4(-0.17861721, 0.005809094, 0.07758347, 0.08741594, -0.06235323, -0.03450685, -0.00552224, 0.041994065, -0.03158233, 0.051622845, 0.0039942446, 0.14311121, 0.14694613, 0.04616712, 0.09610687, -0.043587025) * inp_1_2_0;
    result1 += M4(-0.07450373, 0.2248041, 0.05718081, 0.19759364, 0.089411914, 0.089547835, 0.045496203, 0.08389516, 0.10786909, -0.038745705, -0.06042096, -0.09526875, 0.009795133, 0.026236204, 0.09585282, 0.04806147) * inp_1_0_1;
    result1 += M4(-0.63180786, 0.17878646, -0.29181394, 0.15294215, 0.08455865, 0.5187849, -0.16078512, 0.1025678, -0.32471687, -0.3166111, -0.31447172, 0.22136185, -0.7895637, 0.39729077, -0.67116666, 0.06523181) * inp_1_1_1;
    result1 += M4(-0.29990792, 0.27913773, 0.0868504, 0.16180223, 0.074843474, 0.17685777, -0.07135512, 0.0978162, 0.002066347, -0.2836423, 0.0951881, -0.047261804, -0.16210654, -0.20643821, -0.1867811, -0.08736012) * inp_1_2_1;
    result1 += M4(-0.24924764, -0.0008111519, 0.08319525, 0.015240895, -0.10313035, -0.051989622, -0.23222107, -0.03536586, -0.10352127, 0.008292034, 0.04721943, -0.00065496325, 0.22746576, -0.12374214, -0.23014174, -0.048432473) * inp_1_0_2;
    result1 += M4(-0.4535944, 0.12770292, 0.20552176, -0.036127154, -0.26561648, -0.041770175, -0.083157055, -0.11026962, 0.8672746, 0.03757752, -0.032341328, 0.03445527, -0.9478492, 0.13127916, -0.69294626, 0.02380626) * inp_1_1_2;
    result1 += M4(0.0590312, 0.1763282, -0.10495802, 0.020883244, 0.23956378, 0.066596694, -0.13370325, -0.0070890966, 0.32053608, -0.23686002, -0.0012269815, -0.020056529, -0.07032632, 0.105966575, -0.24962346, 0.027026137) * inp_1_2_2;
    // Each invocation owns two adjacent output texels (x * 2 and x * 2 + 1).
    const ivec2 output_base = ivec2(gl_GlobalInvocationID) * ivec2(2, 1);
    // ReLU: negative components are replaced by zero before the store.
    imageStore(out_image, output_base + ivec2(0, 0), max(result0, V4(0.0)));
    imageStore(out_image, output_base + ivec2(1, 0), max(result1, V4(0.0)));
}

//!DESC [ArtCNN_v0_C4F8_DS_CMP] (Conv2D-4-ReLU)
//!COMPUTE 24 16 12 16
//!HOOK LUMA
//!BIND conv2d_3
//!SAVE conv2d_4
//!WIDTH LUMA.w 2.0 *
//!HEIGHT LUMA.h 1.0 *
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass header notes (kept below the directive lines, which the host player
// reads and which must remain one contiguous run):
//   - input is the texture saved as conv2d_3, output is saved as conv2d_4;
//   - the output is declared 2x LUMA width by 1x LUMA height with 4 components;
//     hook() below writes two horizontally adjacent texels per invocation;
//   - COMPUTE lists 24 16 12 16: 12 threads in x times 2 texels each matches
//     the 24-wide block (see the mpv user-shader documentation for the exact
//     meaning of the four numbers);
//   - WHEN is a postfix expression requiring OUTPUT.w > LUMA.w * 1.2 and
//     OUTPUT.h > LUMA.h * 1.2 (the two comparison results are multiplied).

// Request 16-bit float arithmetic types; V4 / M4 / F resolve to half-precision
// types when the extension macro is defined and to 32-bit types otherwise.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif

// 3x3 convolution footprint.
const ivec2 ksize = ivec2(3, 3);
// Integer half-size of the footprint: (1, 1).
const ivec2 offset = ksize / 2;
// Work-group dimensions (x, y) as signed integers.
const ivec2 wg_size = ivec2(gl_WorkGroupSize);
// Shared tile dimensions: work-group area enlarged by ksize - 1 per axis.
const ivec2 isize = wg_size + ksize - 1;
// Shared input tile, two V4 planes per cell; written and read by hook().
shared V4 inp[2][isize.y][isize.x];
// Entry point of the Conv2D-4-ReLU pass.
// The work group first copies a tile of conv2d_3 (two adjacent texels per
// cell, scaled by conv2d_3_mul) into the shared array `inp`. After a barrier
// every invocation sums 4x4 matrix products over its 3x3 neighbourhood in both
// planes, clamps the two resulting vectors at zero and stores them to
// out_image at x * 2 and x * 2 + 1.
// conv2d_3_raw, conv2d_3_mul and out_image are not declared in this part of the
// file; presumably they come from the host for the BIND / SAVE targets -
// confirm against the mpv user-shader documentation.
void hook() {
    const uvec2 local_xy = gl_LocalInvocationID.xy;
    // First cell of this work group, in units of one invocation.
    ivec2 base = ivec2(gl_WorkGroupID) * wg_size;
    // Strided cooperative load of the isize.x by isize.y tile.
    for (uint y = local_xy.y; y < isize.y; y += wg_size.y) {
        for (uint x = local_xy.x; x < isize.x; x += wg_size.x) {
            // Tile cell (1, 1) corresponds to `base` because `offset` is
            // subtracted; x is doubled since each cell reads two adjacent texels.
            // NOTE(review): coordinates can leave the texture on border work
            // groups; the result of texelFetch there is not established by this
            // code - confirm.
            const ivec2 input_base = (base + ivec2(x,y) - offset) * ivec2(2, 1);
            inp[0][y][x] = V4(conv2d_3_mul * texelFetch(conv2d_3_raw, input_base + ivec2(0, 0), 0));
            inp[1][y][x] = V4(conv2d_3_mul * texelFetch(conv2d_3_raw, input_base + ivec2(1, 0), 0));
        }
    }

    // The tile must be completely written before any invocation reads from it.
    barrier();
    // Constant starting values of the accumulators (presumably biases - TODO confirm).
    V4 result0 = V4(-0.034818295, -0.02108715, -0.014165068, 0.008286487);
    V4 result1 = V4(-0.021342369, -0.021136617, -0.039871223, -0.08978403);
    // Tap naming: inp_<plane>_<dx>_<dy>, read at tile cell (local x + dx, local y + dy).
    const V4 inp_0_0_0 = inp[0][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_0_1_0 = inp[0][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_0_2_0 = inp[0][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_0_0_1 = inp[0][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_0_1_1 = inp[0][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_0_2_1 = inp[0][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_0_0_2 = inp[0][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_0_1_2 = inp[0][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_0_2_2 = inp[0][local_xy.y + 2][local_xy.x + 2];
    // Plane 0 contributions. The order of additions is preserved on purpose:
    // floating-point sums are sensitive to reordering.
    result0 += M4(-0.120068885, -0.010642304, 0.019320194, -0.046139285, 0.13971561, 0.009605191, 0.5416973, -0.0879012, -0.058981802, -0.003479058, 0.10930643, -0.07244141, -0.17652874, 0.025010146, -0.30108804, 0.18976584) * inp_0_0_0;
    result0 += M4(0.14867604, 0.021870252, -0.036030855, -0.048599303, 0.41044092, -0.014084468, 0.04070413, 0.004935904, -0.13889274, -0.017549517, -0.011451426, 0.11533184, -0.5359673, -0.04343586, 0.051145755, -0.021043256) * inp_0_1_0;
    result0 += M4(-0.029872505, 0.00089386344, -0.051546164, 0.03391047, 0.12858436, 0.00017431616, 0.0048978063, 0.026311345, -0.15823978, -0.01621557, 0.027321398, -0.015988123, -0.13520166, 0.0069915047, -0.01195084, 0.015853617) * inp_0_2_0;
    result0 += M4(0.092854284, -0.017413152, 0.23834416, 0.00067862816, 0.21732827, 0.02643737, 0.7367817, 0.31924826, 0.061871614, 0.01227259, -0.045815956, 0.071689226, 0.1254306, -0.017359447, 0.1235184, -0.41852865) * inp_0_0_1;
    result0 += M4(0.17595413, -0.010719707, 0.0068651196, -0.3581599, 0.23194231, -0.09961419, -0.02377718, 0.3597188, 0.25878415, -0.01812122, 0.0863948, -0.31176662, 0.89323723, 0.21346122, 0.2745847, -0.42067942) * inp_0_1_1;
    result0 += M4(-0.10033504, 0.057273034, -0.04238532, 0.09439134, 0.120934956, 0.10621879, -0.040194675, 0.04194932, -0.023418473, 0.1238368, -0.058718715, -0.06516261, 0.02037498, -0.23810127, -0.008571598, 0.033223264) * inp_0_2_1;
    result0 += M4(-0.017195199, -0.0614686, -2.0832696, 0.07460092, -0.11923172, 0.024614496, -2.629726, -0.007838448, 0.09647502, 0.010661809, -0.43024495, -0.09496695, 0.046894938, 0.059043687, -4.9755344, 0.3606515) * inp_0_0_2;
    result0 += M4(0.33000973, 0.03745177, -0.2119929, 0.5356069, -0.089091696, 0.012666864, -0.03579292, -0.030632995, 0.06261729, 0.092487, 0.05551889, 0.1129475, -0.035364818, 0.20677766, 0.29729927, 0.56670374) * inp_0_1_2;
    result0 += M4(-0.0049111648, 0.15045097, 0.09226742, 0.10048135, -0.06438242, 0.018716661, -0.0042132013, 0.038232088, 0.11434216, 0.11124476, 0.0025760008, 0.026893888, 0.16811757, -0.44396934, -0.04617529, 0.031247612) * inp_0_2_2;
    result1 += M4(-0.001829124, -0.0042460826, -0.045003258, -0.011729766, 0.041211355, -0.0173165, -0.008063172, 0.06507508, 0.04243883, -0.025628429, 0.09850709, 0.0047282656, -0.50648594, 0.04286857, 0.18831971, -0.08270936) * inp_0_0_0;
    result1 += M4(0.11643698, -0.016322708, 0.07533118, -0.18811043, 0.08986711, -0.04125679, -0.09450077, 0.15793636, 0.040403467, -0.06335698, 0.1966251, 0.11354182, -0.1501855, 0.07315903, -0.30212653, -0.10489276) * inp_0_1_0;
    result1 += M4(0.012242672, 0.0049883267, -0.043424625, 0.0031150642, 0.11279196, 0.034418993, 0.16808882, -0.09977209, 0.021759149, -0.01712725, -0.044776965, 0.06026308, -0.062080197, -0.029737376, -0.03129875, 0.15636036) * inp_0_2_0;
    result1 += M4(0.030130042, -0.018458083, -0.15381266, 0.046829976, -0.026979234, 0.11408765, 0.05566549, -0.17399815, 0.017612645, 0.04356967, 0.30764085, -0.15369977, 0.09204335, -0.14411147, 0.37210715, 0.32997265) * inp_0_0_1;
    result1 += M4(0.019590132, -0.07640375, 0.74855137, -0.37330675, -0.07635478, 0.4435983, -0.14112692, 0.57418364, -0.037828278, 0.055201348, -0.14108154, 1.0358009, -0.1407098, -0.27115607, -0.98156774, -0.58749133) * inp_0_1_1;
    result1 += M4(0.028695967, -0.058144264, -0.38400516, 0.23668939, 0.011170075, 0.09297126, 0.057955123, -0.12714618, -0.0061560003, -0.038590845, 0.18309161, -0.10017276, -0.014851744, 0.042434927, 0.34019315, 0.06314079) * inp_0_2_1;
    result1 += M4(-0.014526362, 0.012658364, -0.043502677, 0.1593062, -0.00779308, -0.010068203, 0.043102052, 0.12384872, -0.016344436, 0.006179394, -0.039794296, 0.024295827, -0.009175038, 0.13019551, -0.016776806, -0.30826214) * inp_0_0_2;
    result1 += M4(0.020141218, 0.2777369, -0.018752351, -0.27199933, 0.011231427, -0.4189904, -0.0020257703, -0.16548344, -0.014807106, 0.12145978, 0.06487422, 0.31668973, -0.03364522, 0.51271206, -0.03659611, 0.5759994) * inp_0_1_2;
    result1 += M4(0.01970719, 0.3230731, -0.03802732, 0.10350769, 0.019841403, -0.057225563, 0.026538095, 0.19222069, -0.019705452, 0.057873633, -0.054833554, -0.19402951, -0.00052742346, 0.11453526, 0.08675024, -0.32749745) * inp_0_2_2;
    // Same 3x3 neighbourhood, read from plane 1.
    const V4 inp_1_0_0 = inp[1][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_1_1_0 = inp[1][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_1_2_0 = inp[1][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_1_0_1 = inp[1][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_1_1_1 = inp[1][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_1_2_1 = inp[1][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_1_0_2 = inp[1][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_1_1_2 = inp[1][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_1_2_2 = inp[1][local_xy.y + 2][local_xy.x + 2];
    // Plane 1 contributions to both accumulators.
    result0 += M4(0.02896445, -0.019500058, -0.20887016, 0.167427, -0.10857209, 0.015550776, -0.25798947, -0.036908545, 0.33709246, 0.0010951554, 0.27672458, 0.20829159, 0.0433901, -0.033074945, 0.113953255, 0.020882355) * inp_1_0_0;
    result0 += M4(0.5359431, 0.030400937, -0.11510958, 0.027841222, -0.313336, -0.000651533, -0.11134012, 0.2209598, 0.7348431, 0.0047070053, 0.001915848, 0.42694798, 0.052847683, 0.028956084, 0.098483786, -0.12588383) * inp_1_1_0;
    result0 += M4(-0.050897334, 0.01297826, 0.051064905, -0.07874486, -0.1348973, -0.019120563, 0.027248459, -0.0726528, 0.29354712, -0.027323926, 0.053622663, -0.007291186, 0.15766475, 0.009535758, -0.011531149, 0.050886344) * inp_1_2_0;
    result0 += M4(-0.17959939, -0.028911795, -0.22084586, -0.13574629, -0.13546209, 0.020032845, -0.58026844, 0.1740421, -0.13824134, 0.033610854, -0.6895689, 0.051613063, -0.1420372, -0.039808277, -0.28533092, -0.19097781) * inp_1_0_1;
    result0 += M4(-0.026524602, 0.08310219, 0.03911609, -0.040710855, -0.38926157, 0.046499655, -0.01643678, -0.6285207, -0.376694, -0.04628656, 0.11891003, -0.08769094, 0.46731564, 0.0032184166, 0.16490273, 0.40532538) * inp_1_1_1;
    result0 += M4(0.01196172, -0.009028805, -0.003544086, 0.010458567, 0.022031488, -0.24050707, -0.023512868, -0.028953139, 0.091239326, -0.00010132099, -0.06527006, 0.14296758, -0.04341379, 0.15624502, -0.020782096, 0.013121614) * inp_1_2_1;
    result0 += M4(0.025940973, -0.004010571, -0.45591572, -0.027445644, 0.05978278, 0.03160253, -0.07442522, -0.41885218, 0.02854667, -0.0106050195, -1.0329479, 0.14139478, -0.08663926, -0.08029817, 0.21145512, 0.026760614) * inp_1_0_2;
    result0 += M4(0.06566973, 0.009918716, -0.03631913, 0.03671472, 0.37021992, -0.16043891, 0.0038566992, 0.17818058, 0.05782015, 0.0064971577, 0.06291715, 0.10767377, -0.8796218, 0.2662835, 0.013351336, -0.6134637) * inp_1_1_2;
    result0 += M4(0.008632548, -0.017318286, -0.024088413, 0.008102584, 0.124653354, -0.087286994, 0.040308002, 0.028495068, -0.008205336, -0.005723902, -0.012497305, 0.05444067, -0.24626835, -0.036161087, -0.008364729, -0.15170264) * inp_1_2_2;
    result1 += M4(-0.032349497, 0.041516382, -0.03595781, 0.013007499, 0.076520726, 0.027025737, 0.21323368, -0.031516753, -0.003554791, 0.041286003, -0.02512802, -0.28346682, 0.14297897, -0.023353815, -0.22691202, 0.0426327) * inp_1_0_0;
    result1 += M4(-0.06987926, 0.064853884, 0.26980162, -0.0714327, -0.27180284, 0.033226404, -0.51668125, -0.13756119, -0.1314211, -0.22162539, -0.1108404, 0.36452547, 0.34602174, 0.0037092045, 0.49219087, -0.00634266) * inp_1_1_0;
    result1 += M4(-0.039610256, -0.031744383, -0.19099727, -0.04923513, -0.045457784, -0.049151078, -0.0026129757, 0.017248567, -0.0247747, 0.059482098, 0.14844625, -0.18183455, 0.04025794, 0.052286364, -0.062804036, 0.070049584) * inp_1_2_0;
    result1 += M4(0.026138833, -0.07073946, -0.019706476, 0.057851937, 0.086019866, -0.06731423, -0.051327072, -0.45810297, 0.019237107, 0.08426277, -0.09670646, 0.00090766547, -0.054674566, 0.029158289, 0.021574559, 0.21188955) * inp_1_0_1;
    result1 += M4(0.05387268, -0.19674867, 0.10450181, -0.34803218, 0.029806258, -0.31820786, -0.4574834, 0.43130645, 0.038197413, 0.311382, 0.019783724, -0.23793368, 0.06658028, 0.24702781, 0.2493446, -0.33598128) * inp_1_1_1;
    result1 += M4(0.013441342, 0.0091013415, -0.048972968, 0.26746696, -0.01301776, -0.07439922, 0.098007515, -0.16091694, 0.055190634, 0.062807634, 0.06275509, 0.043149263, -0.08796758, 0.023066182, -0.020796193, 0.19874844) * inp_1_2_1;
    result1 += M4(0.0169232, -0.03715479, -0.021770718, -0.116751656, 0.017530631, -0.1089796, -0.03351969, -0.13320439, -0.015331135, 0.11271626, -0.00805957, 0.014932082, -0.035296913, -0.07758614, -0.14463261, 0.20321462) * inp_1_0_2;
    result1 += M4(0.008426807, 0.22596078, -0.033489026, 0.046253152, 0.004117761, 0.1290475, -0.0036282432, -0.016447363, -0.02630206, 0.24279352, -0.014771361, -0.049273532, 0.049909342, -0.8056405, 0.30325535, -0.5089334) * inp_1_1_2;
    result1 += M4(-0.0066918205, 0.023497809, -0.015725413, -0.024844777, 0.011361604, 0.025336564, 0.033177715, -0.10866082, 0.019615075, 0.070816986, 0.0004438535, 0.0072612315, -0.013708224, -0.28692755, -0.20268214, 0.22024308) * inp_1_2_2;
    // Each invocation owns two adjacent output texels (x * 2 and x * 2 + 1).
    const ivec2 output_base = ivec2(gl_GlobalInvocationID) * ivec2(2, 1);
    // ReLU: negative components are replaced by zero before the store.
    imageStore(out_image, output_base + ivec2(0, 0), max(result0, V4(0.0)));
    imageStore(out_image, output_base + ivec2(1, 0), max(result1, V4(0.0)));
}

//!DESC [ArtCNN_v0_C4F8_DS_CMP] (Conv2D-5)
//!COMPUTE 24 16 12 16
//!HOOK LUMA
//!BIND conv2d_4
//!SAVE conv2d_5
//!WIDTH LUMA.w 2.0 *
//!HEIGHT LUMA.h 1.0 *
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass header notes (kept below the directive lines, which the host player
// reads and which must remain one contiguous run):
//   - input is the texture saved as conv2d_4, output is saved as conv2d_5;
//   - the output is declared 2x LUMA width by 1x LUMA height with 4 components;
//     hook() below writes two horizontally adjacent texels per invocation;
//   - COMPUTE lists 24 16 12 16: 12 threads in x times 2 texels each matches
//     the 24-wide block (see the mpv user-shader documentation for the exact
//     meaning of the four numbers);
//   - WHEN is a postfix expression requiring OUTPUT.w > LUMA.w * 1.2 and
//     OUTPUT.h > LUMA.h * 1.2 (the two comparison results are multiplied).

// Request 16-bit float arithmetic types; V4 / M4 / F resolve to half-precision
// types when the extension macro is defined and to 32-bit types otherwise.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif

// 3x3 convolution footprint.
const ivec2 ksize = ivec2(3, 3);
// Integer half-size of the footprint: (1, 1).
const ivec2 offset = ksize / 2;
// Work-group dimensions (x, y) as signed integers.
const ivec2 wg_size = ivec2(gl_WorkGroupSize);
// Shared tile dimensions: work-group area enlarged by ksize - 1 per axis.
const ivec2 isize = wg_size + ksize - 1;
// Shared input tile, two V4 planes per cell; written and read by hook().
shared V4 inp[2][isize.y][isize.x];
// Entry point of the Conv2D-5 pass.
// The work group first copies a tile of conv2d_4 (two adjacent texels per
// cell, scaled by conv2d_4_mul) into the shared array `inp`. After a barrier
// every invocation sums 4x4 matrix products over its 3x3 neighbourhood in both
// planes and stores the two resulting vectors, without any clamping, to
// out_image at x * 2 and x * 2 + 1.
// conv2d_4_raw, conv2d_4_mul and out_image are not declared in this part of the
// file; presumably they come from the host for the BIND / SAVE targets -
// confirm against the mpv user-shader documentation.
void hook() {
    const uvec2 local_xy = gl_LocalInvocationID.xy;
    // First cell of this work group, in units of one invocation.
    ivec2 base = ivec2(gl_WorkGroupID) * wg_size;
    // Strided cooperative load of the isize.x by isize.y tile.
    for (uint y = local_xy.y; y < isize.y; y += wg_size.y) {
        for (uint x = local_xy.x; x < isize.x; x += wg_size.x) {
            // Tile cell (1, 1) corresponds to `base` because `offset` is
            // subtracted; x is doubled since each cell reads two adjacent texels.
            // NOTE(review): coordinates can leave the texture on border work
            // groups; the result of texelFetch there is not established by this
            // code - confirm.
            const ivec2 input_base = (base + ivec2(x,y) - offset) * ivec2(2, 1);
            inp[0][y][x] = V4(conv2d_4_mul * texelFetch(conv2d_4_raw, input_base + ivec2(0, 0), 0));
            inp[1][y][x] = V4(conv2d_4_mul * texelFetch(conv2d_4_raw, input_base + ivec2(1, 0), 0));
        }
    }

    // The tile must be completely written before any invocation reads from it.
    barrier();
    // Constant starting values of the accumulators (presumably biases - TODO confirm).
    V4 result0 = V4(-0.004281367, -0.06462364, -0.07839336, -0.051348798);
    V4 result1 = V4(-0.044737395, -0.05127203, 0.04243012, 0.02257549);
    // Tap naming: inp_<plane>_<dx>_<dy>, read at tile cell (local x + dx, local y + dy).
    const V4 inp_0_0_0 = inp[0][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_0_1_0 = inp[0][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_0_2_0 = inp[0][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_0_0_1 = inp[0][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_0_1_1 = inp[0][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_0_2_1 = inp[0][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_0_0_2 = inp[0][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_0_1_2 = inp[0][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_0_2_2 = inp[0][local_xy.y + 2][local_xy.x + 2];
    // Plane 0 contributions. The order of additions is preserved on purpose:
    // floating-point sums are sensitive to reordering.
    result0 += M4(-0.01830601, -0.06514294, 0.0033470413, -0.012838366, 0.027676288, 0.5378215, -0.46544513, -0.057919182, 0.021866966, -0.046719722, -0.027383242, 0.06306242, 0.030139234, -0.046567786, 0.0070544034, -0.014359486) * inp_0_0_0;
    result0 += M4(-0.009068078, 0.079082526, -0.11168057, -0.0029682794, 0.042158943, -0.1615447, 0.32967436, 0.07279303, 0.07938794, -0.06177304, -0.24630594, 0.119435444, -0.0026040478, 0.001834199, 0.06263507, 0.017454457) * inp_0_1_0;
    result0 += M4(0.0005841043, 0.0044725635, 0.04523616, 0.00439845, 0.0017313332, -0.0076472224, -0.016208844, -0.016707128, -0.20618512, 0.7260729, -0.44414148, 0.00524022, 0.031237518, 0.025385408, -0.01568181, 0.010176574) * inp_0_2_0;
    result0 += M4(0.16000378, 0.0452868, -0.24289155, 0.11479462, -0.08199818, -0.10629858, 0.21941172, -0.12040541, 0.03795813, 0.05538379, 0.018173378, -0.059650656, 0.04337613, 0.10825685, -0.10584103, 0.16212265) * inp_0_0_1;
    result0 += M4(-0.3088016, -0.3237014, 0.55236584, -0.19437416, 0.024665684, -0.3314157, -0.057624705, 0.1524958, 0.5077107, 0.5176829, -0.082513124, 0.18892774, -0.048950084, -0.23500404, 0.3625795, -0.8610678) * inp_0_1_1;
    result0 += M4(-0.009586561, 0.095529184, -0.031118764, -0.0105636595, 0.0059906715, 0.12091095, -0.023144975, -0.0095136, -0.10739631, 0.19548866, -0.19744982, 0.09238595, -0.08407183, -0.5483232, -0.040131737, 0.12753265) * inp_0_2_1;
    result0 += M4(0.040191088, 0.021246016, -0.05649652, -0.04330853, -0.008279291, -0.01418539, -0.0052699465, -0.056048326, 0.0033275655, 0.017330794, -0.018018078, 0.013559137, 0.0021921846, 0.017184587, -0.039381616, 0.050694995) * inp_0_0_2;
    result0 += M4(-0.008553061, 0.18836685, 0.14944008, -0.07758059, -0.02313522, -0.043654464, -0.0041809934, 0.007794027, -0.020844776, -0.06289665, 0.057676286, -0.22661702, -0.027776206, 0.008943579, -0.03747756, -0.07517932) * inp_0_1_2;
    result0 += M4(-0.0075548682, 0.011746554, -0.034675855, -0.020136843, 0.009281594, -0.00313991, -0.015858088, 0.012087803, -0.015303657, -0.10712728, -0.38658586, 0.16948155, 0.010062341, -0.029228298, -0.07722699, -0.009567893) * inp_0_2_2;
    result1 += M4(0.0073817717, -0.0019233908, -0.034115165, -0.018315226, -0.25292194, -0.38935155, 0.172865, 0.3577865, -0.009320541, 0.08279134, -0.05538394, -0.029554628, -0.015672183, 0.06923712, 0.029267587, -0.07237932) * inp_0_0_0;
    result1 += M4(-0.032506377, 0.00048467808, 0.063813865, -0.008346779, -0.24250895, 0.32901675, -0.090850174, -0.11149376, -0.14019696, 0.40758872, -0.15324895, -0.13439961, 0.09043152, -0.10180827, 0.0462725, 0.12740959) * inp_0_1_0;
    result1 += M4(0.031694368, -0.0084457, -0.0007406618, 0.0090185, -0.10183446, -0.06498661, 0.050930828, -0.032385416, 0.44235155, 0.8216581, -0.5294252, 0.03230336, -0.068449736, 0.028927967, -0.011597542, 0.0026664932) * inp_0_2_0;
    result1 += M4(-0.20813712, 0.29896358, 0.0040965206, -0.15766911, 0.03786456, -0.12379672, 0.18037811, -0.21134034, -0.056972563, -0.015926685, 0.06856165, -0.02394938, -0.13099077, 0.14859757, -0.079402156, -0.017835163) * inp_0_0_1;
    result1 += M4(-0.20450954, -1.0352604, 0.2019429, -0.23243459, 0.0020717112, -0.175872, 0.022999937, -0.154238, -0.39868298, -0.22012277, 0.27588624, 0.091856666, 0.19131221, -0.7717992, 0.3209731, -0.11329021) * inp_0_1_1;
    result1 += M4(0.19233546, 0.10114675, -0.02973205, 0.13707553, 0.1083046, -0.06320472, -0.04208164, 0.011700326, 1.3716125, 0.31816658, -0.23687129, 0.31608805, 0.3219586, 0.053873044, -0.35191077, -0.20781805) * inp_0_2_1;
    result1 += M4(-0.018891588, 0.045092393, 0.024048941, -0.0038159497, 0.033036567, 0.041276384, 0.02624164, -0.003581289, -0.0054893796, -0.012374576, 0.012760429, 0.019909162, -0.032665227, 0.030287473, -0.029600834, 0.019104347) * inp_0_0_2;
    result1 += M4(-0.035008386, 0.005900277, -0.13512497, 0.04276038, -0.051221836, -0.01500849, -0.024707671, -0.051521074, 0.07161642, -0.14341873, 0.23306912, 0.018626956, -0.021292191, -0.14911574, -0.05760247, -0.04415544) * inp_0_1_2;
    result1 += M4(-0.017057797, 0.14626054, 0.0055983835, 0.066157766, 0.0030631344, 0.03228664, 0.031747658, 0.018265955, 0.2312253, 0.63441026, 0.2710837, 0.30743408, -0.06542492, -0.07794309, -0.0687424, -0.09177362) * inp_0_2_2;
    // Same 3x3 neighbourhood, read from plane 1.
    const V4 inp_1_0_0 = inp[1][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_1_1_0 = inp[1][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_1_2_0 = inp[1][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_1_0_1 = inp[1][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_1_1_1 = inp[1][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_1_2_1 = inp[1][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_1_0_2 = inp[1][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_1_1_2 = inp[1][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_1_2_2 = inp[1][local_xy.y + 2][local_xy.x + 2];
    // Plane 1 contributions to both accumulators.
    result0 += M4(-0.0125813335, 0.04456755, 0.031095464, -0.033768643, -0.025474552, 0.047821194, 0.2972199, -0.075755104, 0.02107722, 0.035903297, -0.02178329, 0.038003445, 0.008078985, 0.036955025, 0.04548918, -0.048981197) * inp_1_0_0;
    result0 += M4(0.015482036, -0.11684231, -0.040843654, 0.039869566, 0.31621397, 0.68449026, -1.2492493, 0.51071644, 0.035538368, 0.36147186, 0.17711394, -0.020820227, -0.03727489, -0.33107185, -0.120441504, -0.00014320384) * inp_1_1_0;
    result0 += M4(-0.0052193995, 0.048508298, -0.013399912, -0.019454086, 0.09151539, 0.3380062, 0.15445106, -0.010792471, -0.0012494252, -0.035924084, 0.0069643753, 0.005317227, -0.008625564, 0.12795319, -0.043141477, -0.006243163) * inp_1_2_0;
    result0 += M4(-0.07572121, -0.045096144, -0.059617266, -0.031384684, 0.12259394, -0.07052231, -0.019237325, 0.06970596, -0.11495331, -0.005277858, 0.31447667, -0.2623092, 0.018115982, 0.06837497, -0.29947865, 0.40838242) * inp_1_0_1;
    result0 += M4(0.15679273, 0.015594048, 0.07415355, -0.008660666, -0.25030047, -0.23670597, -0.01042092, 0.5379407, -0.35237494, -0.8694414, 0.10242005, 0.008464832, 0.6941308, 0.98734033, -0.43031588, 0.46666718) * inp_1_1_1;
    result0 += M4(-0.060582668, -0.24720898, -0.011741603, 0.052312944, 0.07233516, 0.7304338, 0.09665157, -0.09296339, 0.017674616, -0.074917905, -0.022774164, 0.05449548, -0.027425107, 0.14992653, -0.04792233, -0.05207528) * inp_1_2_1;
    result0 += M4(0.0045490563, 0.008319107, -0.16259977, -0.0032112822, 0.0029059888, -0.022009254, -0.04040798, -0.040270478, -0.055452425, -0.078190826, 0.086357, -0.038482014, 0.004310114, -0.020079859, 0.05782932, -0.06571385) * inp_1_0_2;
    result0 += M4(-0.34668237, -0.86609447, 0.77933687, 0.009690252, 0.03465742, 0.16602355, 0.18746021, 0.032913435, 0.022009058, -0.16650781, -0.1685848, 0.055449747, -0.080491684, -0.11081864, 0.04043112, -0.02708748) * inp_1_1_2;
    result0 += M4(-0.068565525, 0.06624503, -0.0767491, -0.05944897, -0.022932013, -0.050061323, 0.043518413, 0.022343915, 0.0132718, -0.015895389, -0.040303685, 0.0074170083, -0.014733233, 0.010580786, 0.07760989, 0.0048705395) * inp_1_2_2;
    result1 += M4(0.027421322, -0.08443544, 0.051321708, 0.052769598, 0.22831655, -0.15846136, -0.13609648, 0.14715704, -0.06338576, -0.015010224, 0.029399354, -0.014116535, 0.09017143, 0.04507385, 0.01744074, 0.057826526) * inp_1_0_0;
    result1 += M4(-0.037732884, 0.07883075, -0.03988281, -0.085846916, 0.4637774, 1.6967342, 0.44728324, 0.9436958, 0.17161308, -0.08917062, 0.09385671, 0.29406157, -0.102831356, 0.08537221, 0.13619746, -0.19656143) * inp_1_1_0;
    result1 += M4(0.02747906, 0.021771466, -0.00494075, 0.021968542, -0.40014756, -0.085813195, -0.090677544, -0.07061847, -0.057660766, -0.018100219, -0.012795681, -0.030393332, 0.16701369, 0.012351983, 0.046995018, 0.10220106) * inp_1_2_0;
    result1 += M4(0.10884264, 0.10220737, -0.03403188, -0.050782014, 0.063556865, 0.098558396, -0.05480942, -0.12571885, 0.18607329, -0.40140593, 0.16007605, 0.15682594, -0.15406585, 0.15826064, -0.3720378, 0.048261862) * inp_1_0_1;
    result1 += M4(-0.2537625, -0.035236817, 0.17701815, -0.06636021, -0.20040071, 0.4754537, 0.22263189, 0.33746874, 1.2475417, 0.98205847, -0.69257873, 0.10022214, -1.3227742, 0.097527154, -0.30353665, -0.18684489) * inp_1_1_1;
    result1 += M4(0.011863927, -0.07393665, -0.04884496, -0.103234835, -0.29703724, -0.0019379555, 0.175674, 0.23035428, 0.09474622, 0.15701528, -0.034919288, 0.032427303, -0.058852024, -0.20387323, -0.1627409, -0.057415158) * inp_1_2_1;
    result1 += M4(-0.11599976, -0.18028347, 0.031545665, -0.021123331, 0.04432434, 0.049128156, 0.032133788, 0.006227137, 0.085671484, -0.06387286, 0.015659368, -0.019906677, 0.014686817, -0.029591775, 0.0753203, -0.0139786955) * inp_1_0_2;
    result1 += M4(0.1795088, -0.2639199, -0.5535018, -0.4692517, -0.16589205, -0.0010424146, -0.10296826, -0.05240729, 0.012541238, 0.10324619, 0.21597819, 0.022869963, 0.22207361, 0.04337836, 0.05681577, 0.05451058) * inp_1_1_2;
    result1 += M4(-0.5315139, -0.17369154, -0.04726952, -0.3046707, 0.13530041, 0.04440296, 0.06211743, 0.068090685, 0.030161986, -0.05233274, 0.091526896, 0.03369936, 0.018692978, 0.038468525, 0.01363924, 0.039678745) * inp_1_2_2;
    // Each invocation owns two adjacent output texels (x * 2 and x * 2 + 1).
    const ivec2 output_base = ivec2(gl_GlobalInvocationID) * ivec2(2, 1);
    // No clamp is applied in this pass: the accumulated values are stored as they are.
    imageStore(out_image, output_base + ivec2(0, 0), result0);
    imageStore(out_image, output_base + ivec2(1, 0), result1);
}

//!DESC [ArtCNN_v0_C4F8_DS_CMP] (Conv2D-6)
//!COMPUTE 12 16 12 16
//!HOOK LUMA
//!BIND conv2d
//!BIND conv2d_5
//!SAVE conv2d_6
//!WIDTH LUMA.w 1.0 *
//!HEIGHT LUMA.h 1.0 *
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass summary (notes are kept below the header so the directive lines stay
// contiguous):
//   - reads the textures bound as conv2d and conv2d_5;
//   - writes a 4-component texture named conv2d_6 at LUMA.w x LUMA.h;
//   - the WHEN expression is in reverse Polish notation: the pass is enabled
//     only when OUTPUT is more than 1.2x LUMA in both width and height;
//   - COMPUTE 12 16 12 16: presumably a 12x16 block handled by a 12x16
//     workgroup -- confirm against the mpv user-shader documentation.

// Select 16-bit float types when the explicit-arithmetic-types extension is
// available; otherwise fall back to the standard 32-bit types.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif

// Footprint of the neighbourhood read by hook(): 3x3.
const ivec2 ksize = ivec2(3, 3);
// Integer division: (1, 1), i.e. one position of border on each side.
const ivec2 offset = ksize / 2;
// Workgroup dimensions (x, y) as signed integers.
const ivec2 wg_size = ivec2(gl_WorkGroupSize);
// Staged tile size: the workgroup area plus the border needed by the kernel.
const ivec2 isize = wg_size + ksize - 1;
// Workgroup-shared staging area: two planes of V4, indexed [plane][row][col].
shared V4 inp[2][isize.y][isize.x];
// Entry point of the Conv2D-6 pass.
//
// Phase 1: the workgroup cooperatively fills the shared tile `inp` with the
//          element-wise sum of the conv2d_5 and conv2d textures.
// Phase 2: after a barrier, each invocation accumulates a 3x3 neighbourhood
//          over both staged planes using constant 4x4 matrices and stores a
//          single V4 at its global invocation coordinate.
void hook() {
    const uvec2 local_xy = gl_LocalInvocationID.xy;
    // Top-left position covered by this workgroup (before the border shift).
    ivec2 base = ivec2(gl_WorkGroupID) * wg_size;
    // The tile (isize) is larger than the workgroup (wg_size), so each
    // invocation strides by wg_size and may fill more than one tile entry.
    for (uint y = local_xy.y; y < isize.y; y += wg_size.y) {
        for (uint x = local_xy.x; x < isize.x; x += wg_size.x) {
            // Each logical position maps to two horizontally adjacent source
            // texels (x * 2 and x * 2 + 1), which become planes 0 and 1.
            // NOTE(review): for border workgroups these coordinates can lie
            // outside the texture (e.g. -2/-1 at the left edge); the value
            // returned by texelFetch there is driver-dependent -- confirm
            // this is acceptable.
            const ivec2 input_base = (base + ivec2(x,y) - offset) * ivec2(2, 1);
            inp[0][y][x] = V4(conv2d_5_mul * texelFetch(conv2d_5_raw, input_base + ivec2(0, 0), 0) + conv2d_mul * texelFetch(conv2d_raw, input_base + ivec2(0, 0), 0));
            inp[1][y][x] = V4(conv2d_5_mul * texelFetch(conv2d_5_raw, input_base + ivec2(1, 0), 0) + conv2d_mul * texelFetch(conv2d_raw, input_base + ivec2(1, 0), 0));
        }
    }

    // Every invocation must finish writing the tile before any of them reads it.
    barrier();
    // Accumulator starts from a constant vector (presumably the layer bias).
    V4 result0 = V4(0.11104061, 0.16424017, 0.14968067, 0.17041415);
    // Naming: inp_P_X_Y is plane P at column offset X and row offset Y
    // relative to this invocation's top-left tile entry.
    const V4 inp_0_0_0 = inp[0][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_0_1_0 = inp[0][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_0_2_0 = inp[0][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_0_0_1 = inp[0][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_0_1_1 = inp[0][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_0_2_1 = inp[0][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_0_0_2 = inp[0][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_0_1_2 = inp[0][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_0_2_2 = inp[0][local_xy.y + 2][local_xy.x + 2];
    // Plane 0: one constant 4x4 matrix per tap, applied in row-major tap order.
    result0 += M4(0.047740985, -0.06915975, -0.12840918, -0.13696516, -0.019822922, 0.0051126992, 0.011029036, 0.01388047, -0.12527637, 0.0045710327, 0.01367824, 0.028006023, 0.15036204, -0.023977028, -0.030819356, -0.06957354) * inp_0_0_0;
    result0 += M4(0.26731613, 0.022145858, 0.19484667, 0.08164394, -0.05017701, -0.048245452, 0.031440467, 0.06063307, 0.0025612568, 0.021732062, 0.0017977183, 0.02738137, 0.18954153, 0.01947673, -0.0439546, -0.055427354) * inp_0_1_0;
    result0 += M4(0.0041335737, 0.19402702, -0.03757624, 0.075821124, 0.025172025, -0.032536674, 0.019654278, -0.00016893227, 0.009915462, -0.0440859, 0.017372135, -0.011282794, -0.051785924, -0.0060581383, -0.05808605, -0.057233702) * inp_0_2_0;
    result0 += M4(-0.050189953, -0.12238486, 0.045299105, -0.012730087, -0.18205447, 0.14304484, -0.23102546, 0.15107012, 0.19224846, 0.021200405, 0.096434854, 0.04024859, 0.049534317, -0.036928795, 0.022531621, -0.14491597) * inp_0_0_1;
    result0 += M4(0.3085215, 0.41982302, 0.1686751, -0.03160949, 0.31372598, -0.041285247, 0.3667674, -0.082169846, 0.3242587, 0.74179435, 0.26368502, 0.48967704, 0.28606808, 0.21279083, 0.6899051, 0.5170644) * inp_0_1_1;
    result0 += M4(0.26978153, 0.07772519, 0.26085314, 0.28998, -0.02054421, -0.052469943, 0.0018648807, -0.035895158, 0.06726936, 0.048139367, 0.06292994, 0.007206501, -0.3252579, 0.05825191, -0.25152895, 0.17703794) * inp_0_2_1;
    result0 += M4(0.084693275, 0.07200484, 0.03905112, -0.02012441, -0.028879391, 0.014057252, -0.13124909, 0.056587372, -0.006813429, -0.019035023, 0.027532017, -0.022406083, -0.0031911586, 0.022163885, -0.053572964, -0.025793271) * inp_0_0_2;
    result0 += M4(0.10437877, 0.15569969, 0.23475231, 0.35180202, -0.005266871, -0.014464912, 0.035520807, -0.2634638, -0.011925163, -0.007664887, 0.04776647, 0.18550937, -0.032652896, -0.05013306, 0.061106507, -0.023583002) * inp_0_1_2;
    result0 += M4(-0.054981753, -0.011175018, 0.09312574, 0.13339762, 0.021055121, 0.011681347, -0.008994018, -0.08601141, -0.01060539, -0.0017003864, 0.009206573, 0.006515732, 0.004922155, -0.02756114, -0.14444065, -0.07339691) * inp_0_2_2;
    // Plane 1: same 3x3 neighbourhood, second staged plane.
    const V4 inp_1_0_0 = inp[1][local_xy.y + 0][local_xy.x + 0];
    const V4 inp_1_1_0 = inp[1][local_xy.y + 0][local_xy.x + 1];
    const V4 inp_1_2_0 = inp[1][local_xy.y + 0][local_xy.x + 2];
    const V4 inp_1_0_1 = inp[1][local_xy.y + 1][local_xy.x + 0];
    const V4 inp_1_1_1 = inp[1][local_xy.y + 1][local_xy.x + 1];
    const V4 inp_1_2_1 = inp[1][local_xy.y + 1][local_xy.x + 2];
    const V4 inp_1_0_2 = inp[1][local_xy.y + 2][local_xy.x + 0];
    const V4 inp_1_1_2 = inp[1][local_xy.y + 2][local_xy.x + 1];
    const V4 inp_1_2_2 = inp[1][local_xy.y + 2][local_xy.x + 2];
    result0 += M4(0.23778167, -0.071911685, -0.07418469, -0.12794943, -0.119942605, 0.033319514, 0.029954946, 0.069486015, 0.044830173, -0.010898266, 0.004998177, -0.0006213797, -0.12461627, 0.0398384, 0.032540638, 0.060666014) * inp_1_0_0;
    result0 += M4(-0.14298241, -0.00087986584, -0.018375447, -0.0021809211, -0.05405992, 0.0006722785, -0.035345253, -0.02380576, -0.023342691, 0.15823255, 0.02380151, 0.029103812, 0.17818204, 0.017464548, -0.07361105, -0.10984875) * inp_1_1_0;
    result0 += M4(0.036412574, -0.003595665, 0.02716239, 0.031818748, 0.022395257, -0.049635403, 0.026593363, -0.011147749, 0.033951733, -0.019617874, 0.014626017, 0.014490228, -0.104675874, 0.044413727, -0.06271134, -0.04739214) * inp_1_2_0;
    result0 += M4(-0.2515852, -0.04612317, -0.10147058, -0.046280578, 0.02493537, -0.0397943, 0.10132346, 0.011108449, -0.038676765, -0.05952748, 0.0055196867, -0.16050388, -0.006146873, -0.13461623, 0.24933182, -0.048735507) * inp_1_0_1;
    result0 += M4(0.11977903, 0.20816405, 0.10042732, 0.21991056, -0.34656197, 0.005211325, -0.053603142, 0.2559931, 0.1686257, -0.053661104, 0.15230268, 0.49745658, 0.0697554, -0.21057598, -0.047538087, 0.03024779) * inp_1_1_1;
    result0 += M4(0.07098654, -0.033596646, 0.06400351, -0.03440024, 0.061072867, -0.065155946, 0.052914225, -0.08014609, -0.06407926, -0.097300336, -0.00053515454, -0.12330818, -0.059263244, 0.22693981, -0.14654967, 0.1677643) * inp_1_2_1;
    result0 += M4(0.033585005, 0.0651778, -0.13221596, 0.010309706, -0.0051567503, -0.03547191, -0.009153362, -0.03140121, -0.010529688, -0.0036931508, -0.05555019, -0.024437945, -0.044085715, -0.043507773, 0.045291543, -0.025050528) * inp_1_0_2;
    result0 += M4(0.05702522, 0.07238266, 0.09935222, -0.05793195, 0.05556891, 0.0285773, -0.14637265, -0.1052093, -0.06201566, -0.06499971, 0.08710784, -0.06557595, -0.015302304, 0.0016559373, 0.026654541, 0.17367116) * inp_1_1_2;
    result0 += M4(0.004902699, 0.02628713, 0.010531309, -0.032660607, 0.022209503, 0.024280963, 0.018250063, -0.06750173, 0.033424184, 0.02307127, -0.037525203, -0.07885216, -0.05434476, -0.05075653, 0.049834635, 0.21880029) * inp_1_2_2;
    // One output texel per invocation; the (1, 1) factor leaves the
    // coordinate unscaled.
    const ivec2 output_base = ivec2(gl_GlobalInvocationID) * ivec2(1, 1);
    imageStore(out_image, output_base + ivec2(0, 0), result0);
}

//!DESC [ArtCNN_v0_C4F8_DS_CMP] (Depth-To-Space)
//!COMPUTE 12 16 12 16
//!HOOK LUMA
//!BIND conv2d_6
//!WIDTH LUMA.w 2.0 *
//!HEIGHT LUMA.h 2.0 *
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass summary (notes are kept below the header so the directive lines stay
// contiguous):
//   - reads the texture bound as conv2d_6;
//   - output is twice LUMA's width and height;
//   - there is no SAVE directive, so the result presumably replaces the
//     hooked LUMA texture -- confirm against the mpv user-shader documentation;
//   - the WHEN expression is in reverse Polish notation: the pass is enabled
//     only when OUTPUT is more than 1.2x LUMA in both width and height.

// Same fp16/fp32 type selection as the other passes in this file.
// NOTE(review): V4, M4 and F are not referenced by this pass's hook().
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif
// Entry point of the Depth-To-Space pass.
//
// For every output pixel, picks one of the four components of a conv2d_6
// sample; the component is chosen by which half (in x and in y) of the source
// texel the pixel falls into. The value is written to the first channel, with
// the remaining channels fixed at 0, 0 and 1.
void hook() {
    // Fractional position of this pixel inside its conv2d_6 texel, in [0, 1).
    const vec2 texel_frac = fract(conv2d_6_pos * conv2d_6_size);
    // 0 or 1 per axis: which half of the texel the pixel lies in.
    const ivec2 quadrant = ivec2(texel_frac * vec2(2.0));
    // Shift the sampling position by (0.5 - fraction) texels; assuming
    // conv2d_6_pt is the size of one texel, this lands on the texel centre.
    const vec2 sample_pos = (vec2(0.5) - texel_frac) * conv2d_6_pt + conv2d_6_pos;
    // Component index: row * 2 + column.
    const int component = quadrant.y * 2 + quadrant.x;

    vec4 pixel = vec4(0.0, 0.0, 0.0, 1.0);
    pixel.x = conv2d_6_tex(sample_pos)[component];
    imageStore(out_image, ivec2(gl_GlobalInvocationID), clamp(pixel, 0.0, 1.0));
}
